1. Read Libraries
# Read Libraries
library(lubridate)
library(dplyr)
library(tidyverse)
library(forecast)
library(ggplot2)
library(scales)
library(TSstudio)
library(prettydoc)2. Read the Data
# Read Data
data <- read_csv("trainori.csv")3. Data Wrangling
First, we have to check our data structure and its contents, so that we could proceed our analysis at ease.
head(data)As shown above, there are many transactions that occur, however, we need to merge the same receipt number into one receipt number, so that we may have a more accurate forecast for our forecasting model.
We do this by grouping the transaction date and receipt number and summarise it by with ‘summarise’ and use ‘count = n_distinct’ to distinguish which item falls to the same receipt number and are classed as 1 receipt.
# Data Aggregation
data <- data %>%
mutate(transaction_date = floor_date(transaction_date, unit = "hour")) %>%
group_by(transaction_date) %>%
summarise(count = n_distinct(receipt_number)) %>%
ungroup()
dataFurther to that, using the function ‘pad’ to select the range of the data that is being used.
library(padr)
data_complete <- pad(x = data,start_val = range(data$transaction_date)[1],end_val = range(data$transaction_date)[2])Check if there are any NA’s in the data
anyNA(data_complete)#> [1] TRUE
colSums(is.na(data_complete))#> transaction_date count
#> 0 663
We now replace the NA to 0 and we filter in which the FNB restaurant opens at, and remove any hours in which the restaurant does not operate in order to provide clearer forecasting.
data_complete <- data_complete %>%
mutate(count = replace_na(count,0),
hour = hour(transaction_date)) %>%
filter(hour %in% c(10:22)) %>%
select(-hour)
data_completeWe will then proceed to creating our time series object.
# Create time series object
data_ts<-ts(data_complete$count, frequency = 13*7)
data_ts %>%
autoplot4. Decomposition of Time - Series
# Decompose time series object to observe the data
data_ts %>%
tail(13*7*4) %>%
ts_decompose() We can see above from our decomposed plot, the trend shows that this cannot be done in single seasonal forecasting. This can be seen from the Trend category, where the plot shows an increase and decrease throughout the plot.
We then convert the graph to multi-seasonality time series.
# convert to multi-seasonal
msts_visitor <- msts(data_complete$count, seasonal.periods = c(13, 13*7))# Decomposed multi-seasonal result
msts_visitor %>%
tail(13*7*4) %>%
mstl() %>%
autoplotWe can see above in the plot, that the visualization of hourly, daily, and weekly.
5. Cross - Validation
After decomposing, we will proceed in cross-validating the data by creating a new object: 1. train 2. test
# Cross Validation of Data
test <- tail(data_complete, 13*7)
train <- head(data_complete, nrow(data_complete) - 13*7)
test;train6. Forecast Model
Before forecasting the model, we must first scale the data to be proportionate. In this case I will use the library ‘recipes’ to conduct our scaling process.
library(recipes)
rec <- recipe(count~ ., train) %>%
step_sqrt(all_numeric()) %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
prep()train_scale <- juice(rec)
test_scale <- bake(rec,test)We then proceed to creating a multiseasonal model with our train and test models.
train.msts <- msts(train_scale$count, seasonal.periods = c(13,13*7))
test.msts <- msts(test_scale$count, seasonal.periods = c(13,13*7))Next, we proceed to create a multi-seasonal time series model and will forecast the model by using the ARIMA method.
# Make msts and forecast model
model.tbats <- tbats(y = train.msts, use.box.cox = FALSE, use.trend = FALSE,use.damped.trend = FALSE)
model.msts <- stlm(train.msts, method = "arima")
fc.msts <- forecast(model.msts, h = 13*7)
fc.tbats <- forecast(model.tbats, h = 13*7)We revert back the recipes model to original data in order to avoid forecast mismatching.
# revert back function
rec_revert <- function(vector, rec) {
# store recipe values
rec_center <- rec$steps[[2]]$means["count"]
rec_scale <- rec$steps[[3]]$sds["count"]
# convert back based on the recipe
results <- (vector * rec_scale + rec_center) ^ 2
# add additional adjustment if necessary
results <- round(results)
# return the results
results
}Next, we assign each revert data to a new data and set it as numeric, so that forecasting is able to be conducted.
rec_train <- as.numeric(rec_revert(vector = train.msts,rec = rec))
rec_test <- as.numeric(rec_revert(vector = test.msts,rec = rec))
rec_fc_msts <- as.numeric(rec_revert(vector = fc.msts$mean, rec=rec))
rec_fc_tbats <- as.numeric(rec_revert(vector = fc.tbats$mean, rec=rec))Then, we bake the data and create an msts model from our complete scaled data with the seasonal periods.
data_complete.scale <- bake(rec, data_complete)
msts.complete <- msts(data_complete.scale$count, seasonal.periods = c(13, 13*7))Lastly, we will plot our forecast model.
plot_forecast(fc.msts)The graph above shows the Actual data vs. Predicted(Estimated) data with forecasted, 80% confidence, and 95% confidence. With that, we can now begin our Model Evaluation.
7. Model Evaluation
# Model Evaluation
MLmetrics::MAE(rec_fc_msts, rec_test)#> [1] 5.615385
accuracy(rec_fc_msts, rec_test)#> ME RMSE MAE MPE MAPE
#> Test set -1.945055 7.287653 5.615385 -24.11606 35.21826
The MAE shows that our error is quite low at 5.57% which promotes a fairly good model.
8. Bind with Data Submission File
tail(data)# convert to multi-seasonal
msts_visitor <- msts(data_complete$count, seasonal.periods = c(13, 13*7))library(recipes)
rec_asli <- recipe(count~ ., data_complete) %>%
step_sqrt(all_numeric()) %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
prep()
rec_asli_juice <- juice(rec_asli)
msts_juice <- msts(data = rec_asli_juice$count,seasonal.periods = c(13,13*7))# Make msts and forecast model
model.msts <- stlm(msts_juice, method = "arima")
fc.msts <- forecast(model.msts, h = 13*7)model_revert <- as.numeric(rec_revert(vector = fc.msts$mean,rec = rec))) ```